In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

air_satisfaction = pd.read_csv('air satisfaction.csv')
air_satisfaction.head()
Out[1]:
id satisfaction_v2 Gender Customer Type Age Type of Travel Class Flight Distance Seat comfort Departure/Arrival time convenient ... Online support Ease of Online booking On-board service Leg room service Baggage handling Checkin service Cleanliness Online boarding Departure Delay in Minutes Arrival Delay in Minutes
0 11112 satisfied Female Loyal Customer 65 Personal Travel Eco 265 0 0 ... 2 3 3 0 3 5 3 2 0 0.0
1 110278 satisfied Male Loyal Customer 47 Personal Travel Business 2464 0 0 ... 2 3 4 4 4 2 3 2 310 305.0
2 103199 satisfied Female Loyal Customer 15 Personal Travel Eco 2138 0 0 ... 2 2 3 3 4 4 4 2 0 0.0
3 47462 satisfied Female Loyal Customer 60 Personal Travel Eco 623 0 0 ... 3 1 1 0 1 4 1 3 0 0.0
4 120011 satisfied Female Loyal Customer 70 Personal Travel Eco 354 0 0 ... 4 2 2 0 2 4 2 5 0 0.0

5 rows × 24 columns

In [2]:
air_satisfaction.shape
Out[2]:
(129880, 24)
In [3]:
air_satisfaction.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129880 entries, 0 to 129879
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   id                                 129880 non-null  int64  
 1   satisfaction_v2                    129880 non-null  object 
 2   Gender                             129880 non-null  object 
 3   Customer Type                      129880 non-null  object 
 4   Age                                129880 non-null  int64  
 5   Type of Travel                     129880 non-null  object 
 6   Class                              129880 non-null  object 
 7   Flight Distance                    129880 non-null  int64  
 8   Seat comfort                       129880 non-null  int64  
 9   Departure/Arrival time convenient  129880 non-null  int64  
 10  Food and drink                     129880 non-null  int64  
 11  Gate location                      129880 non-null  int64  
 12  Inflight wifi service              129880 non-null  int64  
 13  Inflight entertainment             129880 non-null  int64  
 14  Online support                     129880 non-null  int64  
 15  Ease of Online booking             129880 non-null  int64  
 16  On-board service                   129880 non-null  int64  
 17  Leg room service                   129880 non-null  int64  
 18  Baggage handling                   129880 non-null  int64  
 19  Checkin service                    129880 non-null  int64  
 20  Cleanliness                        129880 non-null  int64  
 21  Online boarding                    129880 non-null  int64  
 22  Departure Delay in Minutes         129880 non-null  int64  
 23  Arrival Delay in Minutes           129487 non-null  float64
dtypes: float64(1), int64(18), object(5)
memory usage: 23.8+ MB
In [4]:
air_satisfaction = air_satisfaction.drop(columns=['id'])
air_satisfaction.columns = [col.replace(' ','_').replace('-','_').replace('/','_').lower() for col in air_satisfaction.columns]
In [5]:
categorical_columns = ['satisfaction_v2', 'gender', 'customer_type', 'type_of_travel', 'class',
                       'seat_comfort', 'seat_comfort', 'departure_arrival_time_convenient',
                       'food_and_drink', 'gate_location', 'inflight_wifi_service', 'inflight_entertainment',
                       'online_support', 'ease_of_online_booking', 'on_board_service', 'leg_room_service',
                       'baggage_handling', 'checkin_service', 'cleanliness', 'online_boarding']
for col in categorical_columns:
    air_satisfaction[col] = air_satisfaction[col].astype('category')
In [6]:
air_satisfaction.head()
Out[6]:
satisfaction_v2 gender customer_type age type_of_travel class flight_distance seat_comfort departure_arrival_time_convenient food_and_drink ... online_support ease_of_online_booking on_board_service leg_room_service baggage_handling checkin_service cleanliness online_boarding departure_delay_in_minutes arrival_delay_in_minutes
0 satisfied Female Loyal Customer 65 Personal Travel Eco 265 0 0 0 ... 2 3 3 0 3 5 3 2 0 0.0
1 satisfied Male Loyal Customer 47 Personal Travel Business 2464 0 0 0 ... 2 3 4 4 4 2 3 2 310 305.0
2 satisfied Female Loyal Customer 15 Personal Travel Eco 2138 0 0 0 ... 2 2 3 3 4 4 4 2 0 0.0
3 satisfied Female Loyal Customer 60 Personal Travel Eco 623 0 0 0 ... 3 1 1 0 1 4 1 3 0 0.0
4 satisfied Female Loyal Customer 70 Personal Travel Eco 354 0 0 0 ... 4 2 2 0 2 4 2 5 0 0.0

5 rows × 23 columns

In [7]:
air_satisfaction.isna().sum()
Out[7]:
satisfaction_v2                        0
gender                                 0
customer_type                          0
age                                    0
type_of_travel                         0
class                                  0
flight_distance                        0
seat_comfort                           0
departure_arrival_time_convenient      0
food_and_drink                         0
gate_location                          0
inflight_wifi_service                  0
inflight_entertainment                 0
online_support                         0
ease_of_online_booking                 0
on_board_service                       0
leg_room_service                       0
baggage_handling                       0
checkin_service                        0
cleanliness                            0
online_boarding                        0
departure_delay_in_minutes             0
arrival_delay_in_minutes             393
dtype: int64
In [8]:
missing = air_satisfaction['arrival_delay_in_minutes'].isnull().sum()
non_missing = air_satisfaction['arrival_delay_in_minutes'].notnull().sum()
values = [missing, non_missing]
labels = ['Missing', 'Non-Missing']

plt.pie(values, labels=labels, autopct='%1.1f%%')
plt.title(f'Missing vs Non-Missing Values in arrival_delay')
plt.show()
In [9]:
clean_air_satisfaction = air_satisfaction.dropna()
clean_air_satisfaction.isna().sum()
Out[9]:
satisfaction_v2                      0
gender                               0
customer_type                        0
age                                  0
type_of_travel                       0
class                                0
flight_distance                      0
seat_comfort                         0
departure_arrival_time_convenient    0
food_and_drink                       0
gate_location                        0
inflight_wifi_service                0
inflight_entertainment               0
online_support                       0
ease_of_online_booking               0
on_board_service                     0
leg_room_service                     0
baggage_handling                     0
checkin_service                      0
cleanliness                          0
online_boarding                      0
departure_delay_in_minutes           0
arrival_delay_in_minutes             0
dtype: int64
In [10]:
sns.boxplot(x=clean_air_satisfaction['age'])
Out[10]:
<Axes: xlabel='age'>
In [11]:
sns.boxplot(x=clean_air_satisfaction['flight_distance'])
Out[11]:
<Axes: xlabel='flight_distance'>
In [12]:
sns.boxplot(x=clean_air_satisfaction['departure_delay_in_minutes'])
Out[12]:
<Axes: xlabel='departure_delay_in_minutes'>
In [13]:
sns.boxplot(x=clean_air_satisfaction['arrival_delay_in_minutes'])
Out[13]:
<Axes: xlabel='arrival_delay_in_minutes'>
In [14]:
sort_age = clean_air_satisfaction.sort_values(['age'], ascending=False)
sort_flight_distance = clean_air_satisfaction.sort_values(['flight_distance'], ascending=False)
sort_depart_delay = clean_air_satisfaction.sort_values(['departure_delay_in_minutes'], ascending=False)
sort_arrival_delay = clean_air_satisfaction.sort_values(['arrival_delay_in_minutes'], ascending=False)

print(f"Top 5\n {sort_age[['age']].head()}")
print(f"Top 5\n {sort_flight_distance[['flight_distance']].head()}")
print(f"Top 20\n {sort_depart_delay[['departure_delay_in_minutes']].head(20)}")
print(f"Top 20\n {sort_arrival_delay[['arrival_delay_in_minutes']].head(20)}")
Top 5
         age
67916    85
92300    85
88349    85
54393    85
111268   85
Top 5
         flight_distance
49083              6951
102409             6950
69690              6948
27712              6924
15497              6907
Top 20
         departure_delay_in_minutes
9704                          1592
122928                        1305
17110                         1128
103605                        1017
3758                           978
67029                          951
99302                          933
80827                          930
52728                          921
110991                         859
73014                          853
76216                          815
3539                           794
74383                          756
127820                         750
75159                          748
73025                          729
5801                           726
5741                           724
10883                          692
Top 20
         arrival_delay_in_minutes
9704                      1584.0
122928                    1280.0
17110                     1115.0
103605                    1011.0
3758                       970.0
80827                      952.0
67029                      940.0
52728                      924.0
99302                      920.0
110991                     860.0
73014                      823.0
76216                      822.0
3539                       795.0
74383                      748.0
127820                     729.0
75159                      720.0
73025                      717.0
5741                       705.0
10883                      702.0
5801                       691.0
In [15]:
clean_air_satisfaction = clean_air_satisfaction.query('departure_delay_in_minutes < 652 & departure_delay_in_minutes < 691')
clean_air_satisfaction
Out[15]:
satisfaction_v2 gender customer_type age type_of_travel class flight_distance seat_comfort departure_arrival_time_convenient food_and_drink ... online_support ease_of_online_booking on_board_service leg_room_service baggage_handling checkin_service cleanliness online_boarding departure_delay_in_minutes arrival_delay_in_minutes
0 satisfied Female Loyal Customer 65 Personal Travel Eco 265 0 0 0 ... 2 3 3 0 3 5 3 2 0 0.0
1 satisfied Male Loyal Customer 47 Personal Travel Business 2464 0 0 0 ... 2 3 4 4 4 2 3 2 310 305.0
2 satisfied Female Loyal Customer 15 Personal Travel Eco 2138 0 0 0 ... 2 2 3 3 4 4 4 2 0 0.0
3 satisfied Female Loyal Customer 60 Personal Travel Eco 623 0 0 0 ... 3 1 1 0 1 4 1 3 0 0.0
4 satisfied Female Loyal Customer 70 Personal Travel Eco 354 0 0 0 ... 4 2 2 0 2 4 2 5 0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
129875 satisfied Female disloyal Customer 29 Personal Travel Eco 1731 5 5 5 ... 2 2 3 3 4 4 4 2 0 0.0
129876 neutral or dissatisfied Male disloyal Customer 63 Personal Travel Business 2087 2 3 2 ... 1 3 2 3 3 1 2 1 174 172.0
129877 neutral or dissatisfied Male disloyal Customer 69 Personal Travel Eco 2320 3 0 3 ... 2 4 4 3 4 2 3 2 155 163.0
129878 neutral or dissatisfied Male disloyal Customer 66 Personal Travel Eco 2450 3 2 3 ... 2 3 3 2 3 2 1 2 193 205.0
129879 neutral or dissatisfied Female disloyal Customer 38 Personal Travel Eco 4307 3 4 3 ... 3 4 5 5 5 3 3 3 185 186.0

129466 rows × 23 columns

In [16]:
clean_air_satisfaction['sum_delay'] = clean_air_satisfaction.apply(lambda x :(x.departure_delay_in_minutes + x.arrival_delay_in_minutes),axis = 1)
clean_air_satisfaction[['departure_delay_in_minutes','arrival_delay_in_minutes','sum_delay']]
Out[16]:
departure_delay_in_minutes arrival_delay_in_minutes sum_delay
0 0 0.0 0.0
1 310 305.0 615.0
2 0 0.0 0.0
3 0 0.0 0.0
4 0 0.0 0.0
... ... ... ...
129875 0 0.0 0.0
129876 174 172.0 346.0
129877 155 163.0 318.0
129878 193 205.0 398.0
129879 185 186.0 371.0

129466 rows × 3 columns

groupby¶

In [17]:
clean_air_satisfaction[['customer_type','age']].groupby('customer_type').agg(['count','min','max','mean']).round(2)
Out[17]:
age
count min max mean
customer_type
Loyal Customer 105754 7 85 41.47
disloyal Customer 23712 7 85 30.35
In [18]:
clean_air_satisfaction[['type_of_travel', 'satisfaction_v2', 'flight_distance']].groupby(['type_of_travel', 'satisfaction_v2']).agg(['count','min','max','mean']).round(2)
Out[18]:
flight_distance
count min max mean
type_of_travel satisfaction_v2
Business travel neutral or dissatisfied 37229 50 6951 2044.93
satisfied 52202 50 6950 2079.99
Personal Travel neutral or dissatisfied 21360 50 6924 1989.91
satisfied 18675 50 6792 1565.18
In [20]:
clean_air_satisfaction[['type_of_travel' ,'class', 'satisfaction_v2','flight_distance', 'sum_delay']].groupby(['type_of_travel', 'class', 'satisfaction_v2']).agg(['count','min','max','mean']).round(2)
Out[20]:
flight_distance sum_delay
count min max mean count min max mean
type_of_travel class satisfaction_v2
Business travel Business neutral or dissatisfied 16580 51 6951 2254.50 16580 0.0 1151.0 35.13
satisfied 42733 50 6950 2166.07 42733 0.0 1239.0 25.65
Eco neutral or dissatisfied 17679 50 6595 1881.55 17679 0.0 1166.0 35.39
satisfied 7550 50 6816 1691.66 7550 0.0 1114.0 26.11
Eco Plus neutral or dissatisfied 2970 54 6324 1847.54 2970 0.0 955.0 38.98
satisfied 1919 52 6733 1690.99 1919 0.0 824.0 23.84
Personal Travel Business neutral or dissatisfied 1424 55 6865 1458.28 1424 0.0 678.0 36.36
satisfied 1240 51 6591 1353.44 1240 0.0 725.0 21.88
Eco neutral or dissatisfied 17533 50 6924 2028.61 17533 0.0 1230.0 36.12
satisfied 15347 50 6792 1587.60 15347 0.0 867.0 20.18
Eco Plus neutral or dissatisfied 2403 50 6889 2022.57 2403 0.0 1137.0 36.08
satisfied 2088 50 6598 1526.13 2088 0.0 239.0 19.67
In [22]:
clean_air_satisfaction[['gender', 'satisfaction_v2', 'flight_distance', 'sum_delay']].groupby(['gender', 'satisfaction_v2']).agg(['count', 'mean', 'median']).round(2)
Out[22]:
flight_distance sum_delay
count mean median count mean median
gender satisfaction_v2
Female neutral or dissatisfied 22894 1924.02 1888.0 22894 39.40 5.0
satisfied 42799 1823.67 1776.0 42799 23.91 1.0
Male neutral or dissatisfied 35695 2089.56 1989.0 35695 33.44 3.0
satisfied 28078 2128.30 2044.0 28078 24.71 1.0
In [21]:
clean_air_satisfaction[['customer_type', 'satisfaction_v2', 'flight_distance', 'sum_delay']].groupby(['customer_type', 'satisfaction_v2']).agg(['count', 'mean', 'median']).round(2)
Out[21]:
flight_distance sum_delay
count mean median count mean median
customer_type satisfaction_v2
Loyal Customer neutral or dissatisfied 40565 2027.76 1959.0 40565 37.49 4.0
satisfied 65189 1936.47 1869.0 65189 24.13 1.0
disloyal Customer neutral or dissatisfied 18024 2018.38 1948.0 18024 31.90 3.0
satisfied 5688 2034.60 1985.0 5688 25.33 1.0

visualization¶

In [23]:
col_list = ['age','flight_distance','departure_delay_in_minutes','arrival_delay_in_minutes']
sns.heatmap(clean_air_satisfaction[col_list].corr(),annot=True,fmt=".2f")
plt.show()
In [24]:
sns.countplot(x='satisfaction_v2',data=clean_air_satisfaction)
plt.show()
In [25]:
sns.countplot(x='customer_type',hue='satisfaction_v2',data=clean_air_satisfaction)
plt.show()
In [26]:
sns.kdeplot(data=clean_air_satisfaction, x="age",hue='satisfaction_v2',multiple="stack")
plt.show()
In [27]:
sns.catplot(data=clean_air_satisfaction, x="age", y="class",hue='satisfaction_v2',kind='box')
plt.show()
C:\Users\porms\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
In [28]:
sns.catplot(data=clean_air_satisfaction, x="satisfaction_v2", y="age", col="type_of_travel",kind="bar")
C:\Users\porms\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
Out[28]:
<seaborn.axisgrid.FacetGrid at 0x29dce390650>
In [29]:
satisfaction_count = clean_air_satisfaction['satisfaction_v2'].value_counts()
plt.pie(satisfaction_count, labels=satisfaction_count.index, autopct='%1.1f%%')
plt.title('Pie Chart of Satisfaction')
plt.show()
In [30]:
import plotly.express as px
fig = px.scatter(clean_air_satisfaction, x='flight_distance', y='sum_delay', color='satisfaction_v2')
fig.show()
In [31]:
sns.scatterplot(data=clean_air_satisfaction, x='flight_distance',y='sum_delay', hue='satisfaction_v2')
Out[31]:
<Axes: xlabel='flight_distance', ylabel='sum_delay'>
C:\Users\porms\anaconda3\Lib\site-packages\IPython\core\events.py:93: UserWarning:

Creating legend with loc="best" can be slow with large amounts of data.

C:\Users\porms\anaconda3\Lib\site-packages\IPython\core\pylabtools.py:152: UserWarning:

Creating legend with loc="best" can be slow with large amounts of data.

In [32]:
sns.distplot(clean_air_satisfaction['flight_distance'])
plt.title('Histogram of flight distance' )  
plt.xlabel('flight_distance')                
C:\Users\porms\AppData\Local\Temp\ipykernel_15288\1216648016.py:1: UserWarning:



`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751


Out[32]:
Text(0.5, 0, 'flight_distance')
In [34]:
sns.pairplot(clean_air_satisfaction, hue="satisfaction_v2")
C:\Users\porms\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning:

The figure layout has changed to tight

Out[34]:
<seaborn.axisgrid.PairGrid at 0x25f0ce7a850>